% This script imputes characteristics of 'pseudo members' in small classes;
% and demeans all variables, y and x
% Date: 2019-1-26
function [Yt,clsXt,indXt,lb,ub,meanVI,meanVC,meanY,OYt,OclsXt,OindXt] = manage_STAR_data(clsData,clsSize); 
global varIndiv varClass indY % declare global var (defined in ...)
% INPUTS
% - clsData: a 1-by-C cell, with each component being data from a class
% - clsSize: an C-vector keeping track of the size of each class
% OUTPUTS
% - Yt: demeaned outcome 
% - clsXt: demeaned class char.
% - indXt: demeaned individual char.
% - meanVI, meanVC, meanY: conditional mean of regressors and outcome given group size
% - OYt: original, undemeaned outcome 
% - OclsXt: original, demeaned class char.
% - OindXt: original, demeaned individual char.
lb = min(clsSize);  ub = max(clsSize); % smallest and largest class sizes
C = length(clsSize); % # of classes
K = size(clsData{1},2); % dimension of regressors
%% ---- impute characteristics of pseudo-members in smaller class ---- 
% pool observations together; pseudo values are drawn from these obs (see Section 5.3 in LQT2018 for details)
tM = []; for ci = 1:C, tM = [tM; clsData{ci}]; end; 
for ki = 1:K, 
    pseudoPopu{ki} = tM(~isnan(tM(:,ki)),ki); 
end; clear tM tV; rng(5, 'twister');
for ci = 1:C; % loop in class 
    n = clsSize(ci);
    xt{ci} = []; % zeros(ub,length(varIndiv));
    for ki = varIndiv, % loop in student-level variables         
        xt{ci} =[xt{ci}, [clsData{ci}(:,ki); randsample(pseudoPopu{ki},ub-n,0)] ];
    end;
end;
%% ---- calculate conditional means of dependent and explanatory var given class size ----
% for class-level variables
meanVC  = zeros(ub-lb+1,length(varClass)); % class variables 
for ik = 1:length(varClass), % loop in class-level characteristics
    for si = lb:ub, % loop in class size         
        varSum = 0; count = 0;
        for ci = 1:C, % loop in classes in data
            % sind = classInd(ci);
            if size(clsData{ci},1) == si & ...
                    ~isnan(clsData{ci}(1,varClass(ik))),
                varSum = varSum + clsData{ci}(1,varClass(ik));
                count = count + 1;
            end;            
        end;
        meanVC(si-lb+1,ik) = varSum/count;
    end;
end;
% for individual-level and outcome variables
meanVI = zeros(ub-lb+1,ub,length(varIndiv)); % individual variables
meanY  = zeros(ub-lb+1,lb); % outcome variables
for ii = 1:ub, % loop in individual labels
     for si = lb:ub, % loop in class size         
         varSum  = zeros(length(varIndiv),1); 
         count   = varSum; varSum2 = 0; count2 = 0; 
         for ci = 1:C, % loop in classes in data
                 % sind = classInd(ci);
                 for ik = 1:length(varIndiv), % loop in individual chara
                     if size(clsData{ci},1) == si & ...
                             ~isnan(xt{ci}(ii,ik)),
                         varSum(ik) = varSum(ik) + xt{ci}(ii,ik);
                         count(ik) = count(ik) + 1;
                     end;            
                 end;
                 if ii < lb+1,
                     if size(clsData{ci},1) == si & ...
                             ~isnan(clsData{ci}(ii,indY));
                         varSum2 = varSum2 + clsData{ci}(ii,indY);
                         count2 = count2+1;
                     end;
                 end;
         end;         
         if max(count==0), zind = find(count==0);
            varSum(zind) = varSum(zind-1); count(zind)=count(zind-1);
         end;
         % meanVI(si-lb+1,ii,:)  = varSum./(count);                
         if ii<lb+1,
            meanY(si-lb+1,ii)  = varSum2/count2;            
         end;
     end;        
end;

%% ---- demean dep. and indep. variables and reorganize ---- 
Yt = zeros(lb,C);  OYt = Yt;
clsXt = zeros(length(varClass),C);  OclsXt = clsXt;
indXt = zeros(length(varIndiv),ub,C); OindXt = indXt;
for ci = 1:C, % loop in classes observed in data
    % sind    = classInd(ci); 
    sz      = clsSize(ci);
    for ik = 1:length(varClass),
        % demean class variables
         clsXt(ik,ci) = clsData{ci}(1,varClass(ik)) - meanVC(sz-lb+1,ik); 
        OclsXt(ik,ci) = clsData{ci}(1,varClass(ik));
    end;
    for ii = 1:lb,
        % demean outcome variables
         Yt(ii,ci) = clsData{ci}(ii,indY) - meanY(sz-lb+1,ii);  
        OYt(ii,ci) = clsData{ci}(ii,indY);  
    end;
    for ii = 1:ub,
        for ik = 1:length(varIndiv),  
            % demean individual variables
            % indXt(ki,ii,ci) = classData{sind}(ii,varIndiv(ki)) - meanVI(PN(:,1)==sz,ii,ki); 
             indXt(ik,ii,ci) = xt{ci}(ii,ik) - meanVI(sz-lb+1,ii,ik); 
            OindXt(ik,ii,ci) = xt{ci}(ii,ik);
        end;    
    end;
end;
